from pyspark import SparkContext, SparkConf

# Create the SparkContext from the cluster configuration
conf = SparkConf().set("spark.cores.max", "4").setAppName("word count").setMaster("spark://10.0.0.252:7077")
sc = SparkContext(conf=conf)

# Read the input file from HDFS
rdd = sc.textFile("hdfs://10.0.0.252:9099/file.txt")

def split_line(line):
    # Split the line on whitespace and emit a (word, 1) pair for each word
    pairs = []
    for word in line.split():
        pairs.append((word, 1))

    return pairs
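
# Example: split_line("to be or not to be") yields
#   [("to", 1), ("be", 1), ("or", 1), ("not", 1), ("to", 1), ("be", 1)]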

# Call flatMap to turn each line into (word, 1) pairs
rdd = rdd.flatMap(split_line)

# Group by word, sum the counts, and sort by count in descending order
rdd = rdd.groupByKey().mapValues(lambda v: sum(v)).sortBy(lambda v: v[1], ascending=False)
# Print the ten most frequent words (the RDD is already sorted by count)
for pair in rdd.take(10):
    print(pair)
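
# A leaner variant of the same job (a sketch, reusing the file path above):
# reduceByKey pre-aggregates counts within each partition before the shuffle,
# which is cheaper than groupByKey on large inputs, and takeOrdered pulls only
# the top results to the driver instead of materializing the whole RDD.
top10 = (sc.textFile("hdfs://10.0.0.252:9099/file.txt")
         .flatMap(lambda line: line.split())
         .map(lambda word: (word, 1))
         .reduceByKey(lambda a, b: a + b)
         .takeOrdered(10, key=lambda pair: -pair[1]))
for pair in top10:
    print(pair)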
sc.stop()
